In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.float_format = '{:.2f}'.format
import json

import plotly.figure_factory as ff
import plotly.express as px
import plotly 

%matplotlib inline
In [2]:
# Load the raw clickstream export. userId/clientId are read as strings (object)
# to avoid numeric precision loss; fully duplicated rows are dropped immediately.
# NOTE(review): parse_dates=['timestamp'] has no visible effect — the column is
# epoch milliseconds and stays object dtype (see data.info() below); it is
# converted explicitly later with pd.to_datetime(..., unit='ms').
data = pd.read_csv('test_data.csv',sep=';',dtype={'userId':object,'clientId':object}, parse_dates=['timestamp']).drop_duplicates()
In [3]:
data
Out[3]:
userId clientId source medium userAgent geoNetwork country countryIsoCode latitude longitude timestamp page_module language index
0 NaN 1079910309.1574080516 google organic Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl... {'country': 'Brazil', 'countryIsoCode': 'BR', ... Brazil BR -23.55 -46.63 1574080576940 media en 0
1 NaN 396545562.1574630738 google organic Mozilla/5.0 (iPhone; CPU iPhone OS 13_1_2 like... {'country': 'United States', 'countryIsoCode':... United States US 40.59 -73.66 1574630738696 media en 0
2 NaN 154243967.1574926021 google organic Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple... {'country': 'New Zealand', 'countryIsoCode': '... New Zealand NZ -36.85 174.76 1574926039618 media en 0
3 NaN 383965497.1574801694 google organic Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl... {'country': 'Canada', 'countryIsoCode': 'CA', ... Canada CA 45.42 -75.70 1574801694271 media en 0
4 NaN 260420380.1574792849 google organic Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:7... {'country': 'Denmark', 'countryIsoCode': 'DK',... Denmark DK 55.48 8.46 1574792850771 media en 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5827337 54990 782126154.1561557942 NaN NaN Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple... {'country': 'Russia', 'countryIsoCode': 'RU', ... Russia RU 61.26 73.38 1574251337895 tournaments en 5826420
5827338 54990 782126154.1561557942 NaN NaN Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple... {'country': 'Russia', 'countryIsoCode': 'RU', ... Russia RU 61.26 73.38 1574263696734 tournaments en 5826430
5827339 54990 782126154.1561557942 NaN NaN Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple... {'country': 'Russia', 'countryIsoCode': 'RU', ... Russia RU 61.26 73.38 1574265022816 tournaments en 5826431
5827340 54990 1052439947.1574369167 NaN NaN Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl... {'country': 'Russia', 'countryIsoCode': 'RU', ... Russia RU 61.26 73.38 1574775027977 tournaments ru 5826740
5827341 54990 1052439947.1574369167 NaN NaN Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl... {'country': 'Russia', 'countryIsoCode': 'RU', ... Russia RU 61.26 73.38 1574775415158 tournaments ru 5826740

5755080 rows × 14 columns

In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5755080 entries, 0 to 5827341
Data columns (total 14 columns):
userId            object
clientId          object
source            object
medium            object
userAgent         object
geoNetwork        object
country           object
countryIsoCode    object
latitude          float64
longitude         float64
timestamp         object
page_module       object
language          object
index             int64
dtypes: float64(2), int64(1), object(11)
memory usage: 658.6+ MB

Here we derive a datetime from the timestamp and extract the platform from the messy userAgent field. The dataset also contains duplicated rows that differ only by a NaN in the userId field. To drop these we use a custom groupby function; such an approach hurts performance, as any custom Python-level function does.

In [5]:
# Derive real datetimes from the epoch-ms timestamp and order events in time.
data['datetime'] = pd.to_datetime(data['timestamp'], unit='ms')
data['date'] = data['datetime'].dt.date
data = data.sort_values(['datetime', 'userId'])

# Extract a coarse platform label from the raw userAgent string.
# The original built an unused `platforms = []` list (removed) and ran a
# per-row Python .apply for each token; a vectorized substring match
# (regex=False) does the same work without the Python-level callback.
# Order matters: a later matching token overwrites an earlier one, exactly
# as in the original loop.
for platform in ["Windows", "Linux", "Macintosh", "iPad", "iPod", "iPhone",
                 "PlayStation", "Android"]:
    # na=False treats missing user agents as "no match" instead of raising,
    # which the original `platform in x` would have done on NaN.
    matches = data['userAgent'].str.contains(platform, regex=False, na=False)
    data.loc[matches, 'platform'] = platform
# Unmatched user agents get the sentinel 0 (filtered downstream via platform != 0).
data['platform'] = data['platform'].fillna(0)
In [6]:
# Keep one event per (datetime, userId) pair.
# NOTE(review): pandas treats NaN as equal to NaN in duplicated(), so two
# anonymous (userId=NaN) events at the same millisecond collapse into one —
# confirm this is intended for anonymous traffic.
data =data.drop_duplicates(subset=['datetime','userId'])

Here we create a few supplementary tables for further data storage in a database, if needed.

In [7]:
# clients table stores, per clientId: first detection datetime, first source,
# first medium and first platform (one row per client).
# The original used groupby().apply(lambda x: x.sort_values(...)[:1]) — the
# slow custom-callback pattern called out in the prose above. Sorting once by
# datetime and keeping the first occurrence of each clientId is equivalent and
# avoids the per-group Python call; the extra sort by clientId reproduces the
# original groupby's key-sorted row order.
clients = (data.sort_values('datetime')
               .drop_duplicates(subset='clientId', keep='first')
               .sort_values('clientId')
               .reset_index(drop=True)
               [['clientId', 'datetime', 'source', 'medium', 'platform']]
               .copy())
clients.rename({'datetime': 'client_first_detection_datetime',
                'source': 'client_first_source',
                'medium': 'client_first_medium',
                'platform': 'client_first_platform'}, axis=1, inplace=True)
#clients.set_index('client_first_detection_datetime',drop=True, inplace=True)
In [8]:
clients.sample(5)
Out[8]:
clientId client_first_detection_datetime client_first_source client_first_medium client_first_platform
36564 1331005410.1575140373 2019-11-30 18:59:33.919 instagram video Android
166911 578717159.1574405769 2019-11-22 06:56:08.541 instagram video Android
131862 262886219.1573481993 2019-11-11 14:19:21.917 yandex search Windows
182082 716372066.1572940579 2019-11-05 07:56:16.148 google organic Android
102508 1930782743.1574899825 2019-11-28 00:10:25.383 google organic Android
In [9]:
# clients_users: lookup table mapping clientId -> userId, keyed by the first
# datetime at which the pair was observed together.
known_users = data.dropna(subset=['userId'])
first_seen = known_users.groupby(['clientId', 'userId'], as_index=False).agg({'datetime': 'min'})
clients_users = first_seen.set_index('datetime', drop=True).copy()
In [10]:
clients_users.sample(5)
Out[10]:
clientId userId
datetime
2019-11-15 13:06:38.661 85599606.1573823185 121814
2019-11-29 13:09:34.858 263292670.1575032958 124827
2019-11-02 18:14:43.318 646137265.1572718152 116785
2019-11-03 10:12:47.799 2110607591.1572775649 117004
2019-11-14 10:35:45.835 1390901322.1573723650 121421
In [11]:
# Gap since this client's previous event (NaT for each client's first event).
data['prev_action_time_spent'] =data.groupby(['clientId'])['datetime'].diff(1)
In [12]:
# True when the event's platform equals the same client's previous event.
# The original compared against groupby(...).shift(0), which is an identity
# transform; comparing directly to the column gives the same result with one
# groupby instead of two. A client's first event compares NaN == value and
# yields False, matching the original.
data['prev_action_platform_same'] = (
    data.groupby(['clientId'])['platform'].shift(1) == data['platform']
)
In [13]:
# Two stacked panels over the same x-axis: cumulative KDE (top) and histogram
# (bottom) of inter-click gaps in minutes, restricted to [1 minute, 1 day].
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
fig, ax = plt.subplots(2,sharex=True, figsize=(20,7) )
fig.suptitle('Here we can visually get information about cumulutative time distribution in minutes between clics timedelta (from 1 minute to 1440 (1 day)). Here we can decide from which timedelta we should separate our data to visits. For example we here can set 3 hours which covers 75% of clics')
# Denser ticks (every 30 min) below 900 minutes, coarser (hourly) above.
plt.xticks(np.concatenate((np.arange(0, 900, 30),np.arange(900, 1441, 60))))
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/kdeplot
# replace it) — this cell assumes the older seaborn pinned for this notebook.
sns.distplot(data.loc[(data['prev_action_time_spent']>=np.timedelta64(1, 'm'))&
                               (data['prev_action_time_spent']<=np.timedelta64(1, 'D'))]
                 ['prev_action_time_spent'].astype('timedelta64[m]'), bins=48 ,kde_kws={'cumulative': False}, color="b",ax=ax[1])
sns.kdeplot(data.loc[(data['prev_action_time_spent']>=np.timedelta64(1, 'm'))&
                               (data['prev_action_time_spent']<=np.timedelta64(1, 'D'))]
                 ['prev_action_time_spent'].astype('timedelta64[m]'),cumulative=True, shade=True, color="b",ax=ax[0])
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x147100410c8>
In [14]:
# Interactive probability-normalized histogram (with box-plot marginal) of the
# same 1-minute-to-1-day inter-click gaps, in minutes.
# NOTE(review): a bare Series is passed as the data frame; px resolves
# x="prev_action_time_spent" through the Series name — works, but a one-column
# DataFrame would be more explicit.
fig = px.histogram(data.loc[(data['prev_action_time_spent']>=np.timedelta64(1, 'm'))&
                               (data['prev_action_time_spent']<=np.timedelta64(1, 'D'))]
                   ['prev_action_time_spent'].astype('timedelta64[m]'), x="prev_action_time_spent",
                   marginal="box", histnorm='probability', # or violin, rug
                   title='Absolute time distribution in minutes between clics timedelta (from 1 minute to 1440 (1 day))')
fig.show()

Here we can decide the timedelta threshold at which to split our data into visits. For example, we can set 1 hour, which covers 80% of clicks.

Without any additional information, I decided to treat every action on the sites with a time difference of more than 1 hour from the previous one as a new login. This covers 80% of all clicks during the day. Additionally, we treat actions coming from a different platform as a new login.

The city was stored as a dictionary inside the geoNetwork field. This extraction is very memory-intensive and could easily crash the session with an out-of-memory error, which is why it is done here.

In [15]:
from ast import literal_eval  # safe parser for the geoNetwork dict-literal strings

# client_login_logs: every action is a new "login" (session start) when it
# (a) follows a gap of >= 1 hour, (b) is the client's first recorded event
# (prev gap is NaT), or (c) happened on a different platform than the
# previous event.
# The original wrapped this in groupby(['clientId','platform']).apply() with
# masks built on the *global* `data` frame instead of the group — the mask is
# row-wise and group-independent, so a plain boolean filter is equivalent
# (the group multi-index was dropped by reset_index anyway) and far faster.
is_session_start = (
    (data['prev_action_time_spent'] >= np.timedelta64(1, 'h'))
    | data['prev_action_time_spent'].isnull()
    | (data['prev_action_platform_same'] == False)
)
client_login_logs = data.loc[is_session_start].copy()
# geoNetwork holds a Python dict literal as a string; literal_eval replaces the
# original eval() — eval on data-file contents is an arbitrary-code-execution risk.
geo = client_login_logs['geoNetwork'].apply(literal_eval).apply(pd.Series)
client_login_logs = pd.concat([client_login_logs, geo], axis=1)
# country/latitude/longitude exist both as raw columns and inside geoNetwork;
# keep only the first occurrence of each duplicated column name.
client_login_logs = client_login_logs.loc[:, ~client_login_logs.columns.duplicated()]
client_login_logs = client_login_logs[['clientId', 'country', 'city', 'latitude',
                                       'longitude', 'date', 'datetime',
                                       'platform']].sort_values('datetime').reset_index(drop=True)
#client_login_logs.set_index('datetime',drop=True, inplace=True)
#client_login_logs=client_login_logs.sort_values('datetime').reset_index(drop=True)
# 0-based running session counter per client, in chronological order.
client_login_logs['session_num'] = client_login_logs.groupby('clientId').cumcount()
In [16]:
client_login_logs.sample(5)
Out[16]:
clientId country city latitude longitude date datetime platform session_num
54330 639401091.1572944343 Kazakhstan Taraz 42.90 71.40 2019-11-06 2019-11-06 03:10:33.056 Android 1
178847 1894890740.1574007008 Ukraine (not set) nan nan 2019-11-17 2019-11-17 16:10:11.030 Windows 0
28482 255207574.1572543899 Russia Lipetsk 52.61 39.60 2019-11-03 2019-11-03 16:11:29.329 Windows 1
26990 1135170100.1572790525 Russia Novosibirsk 55.01 82.94 2019-11-03 2019-11-03 14:15:25.637 Windows 0
301865 1519233675.1572603292 Ukraine Kyiv 50.45 30.52 2019-11-29 2019-11-29 11:06:17.789 Windows 1

For data analysis and visualisation we join all the gathered data back onto the base dataframe. The client_login_logs table only holds the datetime of each session start; since the set is currently sorted by client and date, we can fill the empty values by forward-filling the last known value.

In [17]:
# Attach client-level first-visit attributes and per-session numbers back onto
# the full event log.
data =data.merge(clients,  how='left',on='clientId')
data =data.merge(client_login_logs[['clientId','country','city','date','datetime','platform','session_num']],  how='left',on=['clientId','country','datetime','date','platform'])
# Only session-start rows received session_num/city from the merge; after
# sorting, forward-fill propagates them to the remaining events of the session.
data =data.sort_values(by=['clientId', 'session_num','platform'])
# NOTE(review): a plain ffill can leak values across client boundaries when a
# client's leading rows are NaN — a groupby('clientId') ffill would be safer;
# confirm the sort order guarantees this cannot happen here.
data[['session_num','city']]=data[['session_num','city']].fillna(method='ffill')
data['client_first_detection_date']=data['client_first_detection_datetime'].dt.date
# Age of each event relative to the client's first detection.
data['time_after_aq']=data.datetime-data.client_first_detection_datetime

Here we are going to analyse clients retention.

In [18]:
# Daily session counts split by platform (sentinel platform 0 excluded):
#  - retended_clients: repeat sessions (session_num != 0) per date/platform/session_num
#  - new_clients: distinct clients whose first session (session_num == 0) fell on that date/platform
#  - all_clients: every session per date/platform/session_num
has_platform = client_login_logs['platform'] != 0
repeat_sessions = client_login_logs[(client_login_logs['session_num'] != 0) & has_platform]
first_sessions = client_login_logs[(client_login_logs['session_num'] == 0) & has_platform]

retended_platform = (repeat_sessions
                     .groupby(['date', 'platform', 'session_num'], as_index=False)
                     .agg({'clientId': 'count'})
                     .rename(columns={'clientId': 'retended_clients'}))
first_platform = (first_sessions
                  .groupby(['date', 'platform'], as_index=False)
                  .agg({'clientId': 'nunique'})
                  .rename(columns={'clientId': 'new_clients'}))
all_visits_platform = (client_login_logs[has_platform]
                       .groupby(['date', 'platform', 'session_num'], as_index=False)
                       .agg({'clientId': 'count'})
                       .rename(columns={'clientId': 'all_clients'}))
In [19]:
# Three daily bar charts by platform: all visits, first visits, repeat visits.
chart_specs = [
    (all_visits_platform.groupby(['date', 'platform'], as_index=False).sum(),
     'all_clients',
     "Absolute volume of visits by platform."),
    (first_platform,
     'new_clients',
     "Absolute volume of new visits by platform. Here we count first visit for all clients"),
    (retended_platform.groupby(['date', 'platform'], as_index=False).sum(),
     'retended_clients',
     "Absolute volume of retended visits by platform. Here we count all visits exept first for all clients"),
]
for frame, metric, chart_title in chart_specs:
    px.bar(frame, x='date', y=metric, color='platform', title=chart_title).show()
In [20]:
client_login_logs.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322096 entries, 0 to 322095
Data columns (total 9 columns):
clientId       322096 non-null object
country        322096 non-null object
city           322096 non-null object
latitude       316428 non-null float64
longitude      316428 non-null float64
date           322096 non-null object
datetime       322096 non-null datetime64[ns]
platform       322096 non-null object
session_num    322096 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 22.1+ MB
In [21]:
#We detect retention by first used platform for correct retention rates.
# Wide table: distinct clients per (first platform, session_num 0..10).
# Dividing column k by column 0 gives the k-th retention rate vs the first visit.
retention_pivot=client_login_logs.loc[(client_login_logs['session_num']<=10) &(client_login_logs['platform']!=0)
                     ].merge(clients, on='clientId').pivot_table(
                    values='clientId', index=[ 'client_first_platform'],
                    columns=['session_num'], aggfunc=pd.Series.nunique
                    ).reset_index()
# 'ret k to 1st' = clients reaching session k / clients with a first visit.
retention_pivot[[ 'ret '+str(x)+' to 1st' for x in [*range(1,11,1)]]]=retention_pivot[[*range(1,11,1)]].div(retention_pivot[0], axis=0)
In [22]:
# Retention vs first visit, sessions 1..10, broken down by first platform.
# (Fixed typo in the user-facing message: "se" -> "see".)
print('Here we can see retention rate from 1st retention till 10th vs first visit')
retention_pivot[['client_first_platform'] + ['ret ' + str(x) + ' to 1st' for x in range(1, 11)]]
Here we can se retention rate from 1st retention till 10th vs first visit
Out[22]:
session_num client_first_platform ret 1 to 1st ret 2 to 1st ret 3 to 1st ret 4 to 1st ret 5 to 1st ret 6 to 1st ret 7 to 1st ret 8 to 1st ret 9 to 1st ret 10 to 1st
0 Android 0.14 0.05 0.02 0.01 0.01 0.01 0.01 0.00 0.00 0.00
1 Linux 0.13 0.06 0.05 0.04 0.03 0.03 0.03 0.03 0.02 0.02
2 Macintosh 0.15 0.06 0.04 0.02 0.02 0.02 0.02 0.01 0.01 0.01
3 PlayStation 0.12 0.03 nan nan nan nan nan nan nan nan
4 Windows 0.20 0.10 0.07 0.05 0.04 0.04 0.03 0.03 0.02 0.02
5 iPad 0.12 0.04 0.02 0.01 0.01 0.00 0.00 0.00 0.00 0.00
6 iPhone 0.11 0.03 0.01 0.01 0.01 0.00 0.00 0.00 0.00 0.00
In [23]:
#retention_by_platform_detection_date give us retentions in break by platform.
#We detect retention by first used platform for correct retention rates.
# For every first visit (session 0) join all of that client's later sessions,
# then count distinct retained clients per (acquisition date, platform, session).
first_visits = client_login_logs[(client_login_logs['session_num'] == 0)
                                 & (client_login_logs['platform'] != 0)][
    ['clientId', 'platform', 'date', 'datetime', 'session_num']]
later_visits = client_login_logs[(client_login_logs['session_num'] != 0)
                                 & (client_login_logs['platform'] != 0)][
    ['clientId', 'session_num']]
retention_by_platform_detection_date = (first_visits
    .merge(later_visits, on='clientId')
    .groupby(['date', 'platform', 'session_num_y'], as_index=False)
    .agg({'clientId': 'nunique'})
    .rename(columns={'session_num_y': 'session_num', 'clientId': 'retended_clients'}))

# Retention rate per acquisition-date cohort: retained / new clients.
retention_rate_by_platform_detection_date = (first_platform
    .merge(retention_by_platform_detection_date, how='left', on=['platform', 'date'])
    .fillna(0)
    .rename(columns={'date': 'client_first_detection_date'}))
retention_rate_by_platform_detection_date['retention_rate'] = (
    retention_rate_by_platform_detection_date.retended_clients
    / retention_rate_by_platform_detection_date.new_clients)
# .copy() so the astype assignment below does not hit a SettingWithCopyWarning
# (the original assigned into a .loc slice view).
retention_rate_by_platform_first_visits_detection_date = (
    retention_rate_by_platform_detection_date.loc[
        (retention_rate_by_platform_detection_date['session_num'] <= 5)
        & (retention_rate_by_platform_detection_date['session_num'] >= 1)].copy())
retention_rate_by_platform_first_visits_detection_date['session_num'] = (
    retention_rate_by_platform_first_visits_detection_date['session_num'].astype(str))

#retention rate by platform
# as_index=False keeps 'platform' as a column — the original groupby().sum()
# moved the merge key into the index, which breaks merge(on='platform').
retention_rate_by_platform = (retention_rate_by_platform_detection_date[
        ['platform', 'session_num', 'retended_clients']]
    .groupby(['platform', 'session_num'], as_index=False).sum()
    .merge(first_platform.groupby(['platform'], as_index=False).sum(),
           how='left', on='platform'))
retention_rate_by_platform['retention_rate'] = (
    retention_rate_by_platform['retended_clients']
    / retention_rate_by_platform['new_clients'])
retention_rate_by_platform_first_visits = retention_rate_by_platform.loc[
    (retention_rate_by_platform['session_num'] <= 5)
    & (retention_rate_by_platform['session_num'] >= 1)].copy()
retention_rate_by_platform_first_visits['session_num'] = (
    retention_rate_by_platform_first_visits['session_num'].astype(str))
In [24]:
# Retention-rate curves (sessions 1-10) per platform, with a range slider.
fig = px.line(retention_rate_by_platform.loc[(retention_rate_by_platform['session_num']<=10)&(retention_rate_by_platform['session_num']>=1)], 
              x='session_num', y='retention_rate', color="platform", hover_data=['new_clients','retention_rate','session_num'], title="Retention rate by platform for first 10 visits vs 1st visit")
fig.update_xaxes(rangeslider_visible=True)
fig.show()
In [25]:
# Chain retention: share of clients at session k who return for session k+1.
for session in range(10):
    next_session = session + 1
    retention_pivot['ret ' + str(next_session) + ' to prev'] = (
        retention_pivot[next_session] / retention_pivot[session]
    )
In [26]:
# After ~3 sessions the chain-retention curves converge across platforms.
# (Fixed typos/grammar in the user-facing message: "firs", "3 visit", etc.)
print("Here we can see that conversion after the first 3 visits stabilises, so after the 3rd visit the first platform no longer matters for further client retention")
retention_pivot[['client_first_platform'] + ['ret ' + str(x) + ' to prev' for x in range(1, 11)]]
Here we can see that conversion after first 3 visits stabilises so we can say that after 3 visit firs platform has no matter for futher clients retentions
Out[26]:
session_num client_first_platform ret 1 to prev ret 2 to prev ret 3 to prev ret 4 to prev ret 5 to prev ret 6 to prev ret 7 to prev ret 8 to prev ret 9 to prev ret 10 to prev
0 Android 0.14 0.32 0.49 0.62 0.69 0.76 0.78 0.83 0.77 0.81
1 Linux 0.13 0.47 0.80 0.73 0.97 0.94 0.94 0.93 0.82 0.87
2 Macintosh 0.15 0.40 0.62 0.68 0.83 0.88 0.91 0.86 0.84 0.98
3 PlayStation 0.12 0.24 nan nan nan nan nan nan nan nan
4 Windows 0.20 0.51 0.68 0.77 0.80 0.83 0.85 0.87 0.88 0.89
5 iPad 0.12 0.31 0.46 0.61 0.55 0.67 0.50 1.00 1.00 1.00
6 iPhone 0.11 0.27 0.46 0.61 0.64 0.72 0.75 0.75 0.82 0.81
In [27]:
# Melt the wide chain-retention table to long format -> one line per first platform.
fig = px.line(retention_pivot[['client_first_platform']+[ 'ret '+str(x)+' to prev' for x in [*range(1,11,1)]]].melt(id_vars=['client_first_platform'],value_name='retention_rate'), 
              x='session_num', y='retention_rate', color="client_first_platform", hover_data=['retention_rate','session_num'], title="Retention rate by platform for first 10 visits vs previous visit")
fig.update_xaxes(rangeslider_visible=True)
fig.show()
In [28]:
# One bar chart per platform: retention rate (sessions 1-5) by acquisition date.
by_platform = retention_rate_by_platform_first_visits_detection_date.groupby('platform')
for platform_name, platform_frame in by_platform:
    chart = px.bar(
        platform_frame,
        x='client_first_detection_date',
        y='retention_rate',
        color='session_num',
        title="Retention rate of visits by detection date for " + platform_name,
    )
    chart.show()